Disclaimer :

The main aim of this activity is simply to create a dataset of synonyms and antonyms of English words.
All the data belongs to its original creator at https://www.thesaurus.com/


Execution Flow :

Note:
Synonyms and antonyms should be placed in the same row as a single entry, with the words separated by ‘;’.


Loading the libraries

library(rvest)
package ‘rvest’ was built under R version 3.5.3
Loading required package: xml2
package ‘xml2’ was built under R version 3.5.3
library(stringr)
package ‘stringr’ was built under R version 3.5.3

Load the data for letter

I created this dataset to record how many web pages are available for the words starting with each letter.

# Read the hand-built per-letter index from disk and echo it for inspection.
# The scraping loop further down reads column 2 of this table as the base URL
# of a letter's listing and column 3 as the number of listing pages.
thesaurus_data <- read.csv(
  file = "~/R/Thesaurus_web_scraping/thesaurus_letter_page_number.csv"
)
print(thesaurus_data)

Get all the words from website

# Number of alphabet letters whose word listings will be walked.
total_letters <- 26

# Empty one-column table that the scraping loop appends the collected words to.
word_data <- data.frame(
  word = character(0)
)

The following loop requests each link and collects all the words for all 26 letters.
This took me about 6 minutes 30 seconds to run.

# Scrape the alphabetical index: for every letter, visit each of its listing
# pages and collect the words shown there.
#
# Reads:  thesaurus_data (column 2 = base URL of the letter's listing,
#         column 3 = number of listing pages for that letter), total_letters.
# Writes: word_data, appending a one-column data frame of the scraped words.
#
# The words of each page are gathered in a list and bound to word_data once at
# the end; calling rbind() on every page copies the whole table each time.

# Never index past the rows that the letter table actually has.
letter_count <- min(total_letters, nrow(thesaurus_data))
scraped_pages <- list()

for (i in seq_len(letter_count)) {
  base_url <- as.character(thesaurus_data[i, 2])
  letter_total_page <- thesaurus_data[i, 3]

  # seq_len() runs zero times for a page count of 0, whereas 1:0 would request
  # the non-existent pages 1 and 0.
  for (j in seq_len(letter_total_page)) {
    letter_url <- paste0(base_url, j)
    letter_webpage <- read_html(letter_url)
    letter_html_words <- html_nodes(letter_webpage, ".e1j8zk4s1")
    scraped_pages[[length(scraped_pages) + 1L]] <- html_text(letter_html_words)
  }
}

letter_page_words <- unlist(scraped_pages, use.names = FALSE)
if (length(letter_page_words) > 0) {
  # The column keeps the name the per-page frames used to carry, and the words
  # are stored as plain text regardless of the R version's factor default.
  word_data <- rbind(
    word_data,
    data.frame(letter_page_words = letter_page_words, stringsAsFactors = FALSE)
  )
}

The words collected so far:

# Number of rows (words) collected into word_data.
nrow(word_data)
[1] 133322
# Preview the first few collected words.
head(word_data)

Get the synonyms and antonyms for each word

# Prefix that a URL-escaped word is appended to in order to reach its page.
browse_url <- "https://www.thesaurus.com/browse/"

# Number of collected words, i.e. the upper bound of the scraping loop.
total_word <- nrow(word_data)

# Empty result table; one row per scraped word is appended to it.
thesaurus <- data.frame(
  word = character(0),
  browser_word = character(0),
  url = character(0),
  synoyms = character(0),
  antonyms = character(0)
)

The scraping took around 52 hours 8 minutes to complete.

# Scrape the first synonym block and the first antonym block of every word.
#
# Reads:  word_data (column 1 = word), total_word, browse_url.
# Writes: thesaurus, appending one row per processed word with the columns
#         actual_word, current_word, current_url, synoyms, antonyms.
#
# Synonyms and antonyms are each stored as one string with the words separated
# by ";". A word with no matches gets an empty string; a word whose page could
# not be fetched gets NA, so the two cases stay distinguishable.

# Row of word_data to start from. 131903 is the value this loop was written
# with (a resume point near the end of the word list); set it to 1 to process
# every word.
start_index <- 131903

# An explicit guard: start_index:total_word would count DOWN if total_word
# were smaller than start_index.
word_rows <- if (start_index <= total_word) {
  seq.int(start_index, total_word)
} else {
  integer(0)
}

# Preallocate the result columns instead of rbind()-ing a one-row data frame
# per word, which copies the whole table on every iteration.
n_rows <- length(word_rows)
col_actual_word <- character(n_rows)
col_current_word <- character(n_rows)
col_current_url <- character(n_rows)
col_synoyms <- rep(NA_character_, n_rows)
col_antonyms <- rep(NA_character_, n_rows)

for (k in seq_along(word_rows)) {
  i <- word_rows[k]

  # as.character() guards against word_data holding the words as a factor.
  actual_word <- as.character(word_data[i, 1])
  current_word <- str_replace_all(actual_word, " ", "%20")
  current_word <- str_replace_all(current_word, "'", "%27")
  current_url <- paste0(browse_url, current_word)

  col_actual_word[k] <- actual_word
  col_current_word[k] <- current_word
  col_current_url[k] <- current_url

  # One unreachable page must not abort the whole run: report it and move on,
  # leaving NA in the synonym/antonym columns for this word.
  word_webpage <- tryCatch(
    read_html(current_url),
    error = function(e) {
      warning(
        "Failed to read ", current_url, ": ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    }
  )
  if (is.null(word_webpage)) {
    next
  }

  # Get all synonym words (children of the first matching node only)
  syn_words_html <- html_nodes(word_webpage, ".en1b8750+ .e1qo4u830 .et6tpn80")
  syn_words <- html_text(html_children(syn_words_html[1]))

  # Get all antonym words (children of the first matching node only)
  ant_words_html <- html_nodes(word_webpage, ".em66cyi0+ .e1qo4u830 .et6tpn80")
  ant_words <- html_text(html_children(ant_words_html[1]))

  col_synoyms[k] <- paste(syn_words, collapse = ";")
  col_antonyms[k] <- paste(ant_words, collapse = ";")

  print(paste0("passed number: ", i, " word : ", current_word))
}

# Append all scraped rows to the result table in a single step.
if (n_rows > 0) {
  thesaurus <- rbind(
    thesaurus,
    data.frame(
      actual_word = col_actual_word,
      current_word = col_current_word,
      current_url = col_current_url,
      synoyms = col_synoyms,
      antonyms = col_antonyms,
      stringsAsFactors = FALSE
    )
  )
}
head(thesaurus)

This concludes the exercise.

Reference:
* https://www.analyticsvidhya.com/blog/2017/03/beginners-guide-on-web-scraping-in-r-using-rvest-with-hands-on-knowledge/


LS0tDQp0aXRsZTogIlNjcmFwaW5nIHRoZSBUaGVzYXVydXMuY29tIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCiAgDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLSAgDQoNCiMjIyMgRGlzY2xhaW1lciA6DQpUaGUgbWFpbiBhaW0gb2YgdGhpcyBhY3Rpdml0eSBpcyBqdXN0IHRvIGNyZWF0ZSBhIGRhdGFzZXQgb2Ygc3lub255bXMgYW5kIGFudG9ueW1zIG9mIGVuZ2xpc2ggd29yZHMuICANCkFsbCB0aGUgZGF0YSBiZWxvbmdzIHRvIGl0cyBvcmlnaW5hbCBjcmVhdG9yIGF0IFtodHRwczovL3d3dy50aGVzYXVydXMuY29tL10oaHR0cHM6Ly93d3cudGhlc2F1cnVzLmNvbS8pDQoNCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQogIA0KIyMjIEV4ZWN1dGlvbiBGbG93IDogDQoqIExvYWQgdGhlIGRhdGFzZXQgZm9yIGxldHRlciBhbmQgbGlua3MgYW5kIHRvdGFsIHBhZ2UgbnVtYmVycyBmb3IgZWFjaCBsZXR0ZXINCiogR2V0IGFsbCB0aGUgd29yZHMgZm9yIGxldHRlcg0KKiBGb3IgZWFjaCB3b3JkLCBnZXQgdGhlIGZpcnN0IHN5bm9ueW0gYW5kIGFudG9ueW0NCiogRmluYWwgb3V0cHV0IHNob3VsZCBiZSBhIGRhdGFmcmFtZSBjb25zaXN0aW5nIG9mIGZvbGxvd2luZyBjb2x1bW5zOg0KICAgICB8IFdvcmQgfCBTeW5vbnltIHwgQW50b255bSB8IExpbmsgfA0KICANCiAgDQoqKk5vdGU6ICoqICANClN5bm9ueW0gYW5kIGFudG9ueW0gc2hvdWxkIGJlIHBsYWNlZCBpbiBzYW1lIHJvdyBhcyBhIHNpbmdsZSBlbnRyeSB3aXRoIHdvcmRzIHNlcGFyYXRlZCBieSAnOycuDQogIA0KLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiAgDQojIyMgTG9hZGluZyB0aGUgbGlicmFyaWVzDQpgYGB7cn0NCmxpYnJhcnkocnZlc3QpDQpsaWJyYXJ5KHN0cmluZ3IpDQpgYGANCiAgDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KICANCiMjIyBMb2FkIHRoZSBkYXRhIGZvciBsZXR0ZXINCg0KSSBjcmVhdGVkIHRoaXMgZGF0YXNldCBkaXNwbGF5aW5nIGhvdyBtYW55IHdlYnBhZ2VzIGFyZSBhdmFpbGFibGUgZm9yIGFsbCB3b3JkcyBzdGFydGluZyB3aXRoIHBhcnRpY3VsYXIgbGV0dGVyLg0KDQpgYGB7cn0NCnRoZXNhdXJ1c19kYXRhIDwtIHJlYWQuY3N2KCJ+L1IvVGhlc2F1cnVzX3dlYl9zY3JhcGluZy90aGVzYXVydXNfbGV0dGVyX3BhZ2VfbnVtYmVyLmNzdiIpDQpwcmludCh0aGVzYXVydXNfZGF0YSkNCmBgYA0KICAgIA0KLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiAgDQojIyMgR2V0IGFsbCB0aGUgd29yZHMgZnJvbSB3ZWJzaXRlDQoNCmBgYHtyfQ0Kd29yZF9kYXRhIDwtIGRhdGEuZnJhbWUod29yZD1jaGFyYWN0ZXIoMCkpDQp0b3RhbF9sZXR0ZXJzIDwtIDI2DQpgYGANCiAgDQpGb2xsb3dpbmcgbG9vcCB3aWxsIGNhbGwgZm9yIGVhY2ggbGluayBhbmQgd2lsbCBnZXQgYWxsIHRoZSB3b3JkcyBmb3IgYWxsIDI2IGxldHRlcnMuICAN
ClRoaXMgdG9vayBtZSBhYm91dCA2IG1pbnV0ZXMgMzAgc2Vjb25kcy4gIA0KYGBge3J9DQpmb3IgKGkgaW4gMTp0b3RhbF9sZXR0ZXJzKSB7DQogIGJhc2VfdXJsIDwtIGFzLmNoYXJhY3Rlcih0aGVzYXVydXNfZGF0YVtpLDJdKQ0KICBsZXR0ZXJfdG90YWxfcGFnZSA8LSB0aGVzYXVydXNfZGF0YVtpLDNdDQogIGxldHRlcl9wYWdlX2xpbmsgPC0gY2hhcmFjdGVyKGxlbmd0aChsZXR0ZXJfdG90YWxfcGFnZSkpDQogIGZvcihqIGluIDE6bGV0dGVyX3RvdGFsX3BhZ2UpIHsNCiAgICBsZXR0ZXJfdXJsIDwtIHBhc3RlKGJhc2VfdXJsLGosc2VwID0gIiIpDQogICAgbGV0dGVyX3dlYnBhZ2UgPC0gcmVhZF9odG1sKGxldHRlcl91cmwpDQogICAgbGV0dGVyX2h0bWxfd29yZHMgPC0gaHRtbF9ub2RlcyhsZXR0ZXJfd2VicGFnZSwgJy5lMWo4ems0czEnKQ0KICAgIGxldHRlcl9wYWdlX3dvcmRzIDwtIGh0bWxfdGV4dChsZXR0ZXJfaHRtbF93b3JkcykNCiAgICBsZXR0ZXJfcGFnZV93b3JkcyA8LSBhcy5kYXRhLmZyYW1lKGxldHRlcl9wYWdlX3dvcmRzLHJvdy5uYW1lcyA9IE5VTEwpDQogICAgd29yZF9kYXRhIDwtIHJiaW5kKHdvcmRfZGF0YSwgbGV0dGVyX3BhZ2Vfd29yZHMpDQogIH0NCn0NCmBgYA0KICANCkFsbCB0aGUgd29yZHMgdGhhdCB3ZSBnb3QgdGlsbCBub3cgYXJlOg0KYGBge3J9DQpucm93KHdvcmRfZGF0YSkNCmhlYWQod29yZF9kYXRhKQ0KYGBgDQogICAgDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KICANCiMjIyBHZXQgdGhlIHN5bm9ueW0gYW5kIGFudG9ueW0gZm9yIHRoZSBlYWNoIHdvcmQNCg0KYGBge3J9DQp0b3RhbF93b3JkIDwtIG5yb3cod29yZF9kYXRhKQ0KYnJvd3NlX3VybCA8LSAiaHR0cHM6Ly93d3cudGhlc2F1cnVzLmNvbS9icm93c2UvIg0KdGhlc2F1cnVzIDwtIGRhdGEuZnJhbWUod29yZD1jaGFyYWN0ZXIoMCksYnJvd3Nlcl93b3JkPWNoYXJhY3RlcigwKSwNCiAgICAgICAgICAgICAgICAgICAgICAgdXJsPWNoYXJhY3RlcigwKSxzeW5veW1zPWNoYXJhY3RlcigwKSxhbnRvbnltcz1jaGFyYWN0ZXIoMCkpDQpgYGANCiAgICANCiAgICANClRoaXMgdG9vayBhcm91bmQgNTIgaG91cnMgOCBtaW51dGVzIHRvIGNvbXBsZXRlIHRoZSBzY3JhcGluZw0KYGBge3J9DQpmb3IgKGkgaW4gMTMxOTAzOnRvdGFsX3dvcmQpIHsNCiAgYWN0dWFsX3dvcmQgPC0gd29yZF9kYXRhW2ksMV0NCiAgY3VycmVudF93b3JkIDwtIHN0cl9yZXBsYWNlX2FsbChhY3R1YWxfd29yZCwgIiAiLCIlMjAiKQ0KICBjdXJyZW50X3dvcmQgPC0gc3RyX3JlcGxhY2VfYWxsKGN1cnJlbnRfd29yZCwgIiciLCIlMjciKQ0KICBjdXJyZW50X3VybCA8LXBhc3RlKGJyb3dzZV91cmwsY3VycmVudF93b3JkLHNlcCA9ICIiKQ0KICB3b3JkX3dlYnBhZ2UgPC0gcmVhZF9odG1sKGN1cnJlbnRfdXJsKQ0KICANCiAgI0dldCBhbGwgc3lub255bSB3b3Jkcw0KICBzeW5fd29yZHNfaHRtbCA8LSBodG1sX25vZGVzKHdvcmRf
d2VicGFnZSwgJy5lbjFiODc1MCsgLmUxcW80dTgzMCAuZXQ2dHBuODAnKQ0KICBzeW5fd29yZHMgPC0gaHRtbF90ZXh0KGh0bWxfY2hpbGRyZW4oc3luX3dvcmRzX2h0bWxbMV0pKQ0KICANCiAgI0dldCBhbGwgYW50b255bSB3b3Jkcw0KICBhbnRfd29yZHNfaHRtbCA8LSBodG1sX25vZGVzKHdvcmRfd2VicGFnZSwgJy5lbTY2Y3lpMCsgLmUxcW80dTgzMCAuZXQ2dHBuODAnKQ0KICBhbnRfd29yZHMgPC0gaHRtbF90ZXh0KGh0bWxfY2hpbGRyZW4oYW50X3dvcmRzX2h0bWxbMV0pKQ0KICANCiAgI3VwZGF0ZSB0aGUgZGF0YWZyYW1lDQogIHN5bm95bXMgPC0gdG9TdHJpbmcoc3luX3dvcmRzKQ0KICBhbnRvbnltcyA8LSB0b1N0cmluZyhhbnRfd29yZHMpDQogIHRlbXBfZGF0YSA8LSBkYXRhLmZyYW1lKGFjdHVhbF93b3JkLGN1cnJlbnRfd29yZCxjdXJyZW50X3VybCxzeW5veW1zLGFudG9ueW1zKQ0KICB0aGVzYXVydXMgPC0gcmJpbmQodGhlc2F1cnVzLHRlbXBfZGF0YSkNCiAgDQogIHByaW50KHBhc3RlMCgicGFzc2VkIG51bWJlcjogIiwgaSwgIiB3b3JkIDogIiwgY3VycmVudF93b3JkKSkNCn0NCmBgYA0KICAgIA0KICANCmBgYHtyfQ0KaGVhZCh0aGVzYXVydXMpDQpgYGANCg0KICAgIA0KLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCiAgDQpUaGlzIGNvbmNsdWRlcyB0aGUgZXhlcmNpc2UuICANCg0KKipSZWZlcmVuY2U6KiogIA0KKiBodHRwczovL3d3dy5hbmFseXRpY3N2aWRoeWEuY29tL2Jsb2cvMjAxNy8wMy9iZWdpbm5lcnMtZ3VpZGUtb24td2ViLXNjcmFwaW5nLWluLXItdXNpbmctcnZlc3Qtd2l0aC1oYW5kcy1vbi1rbm93bGVkZ2UvICANCiAgICANCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQo=